* Session setup: start from an empty workspace (data, matrices, programs),
* then raise the variable limit and turn off output paging.
clear all
clear matrix
set maxvar 15000
set more off

/*-----------------------------------------------------------------------------
	Purpose: Clean and code all cross sections of NLSYW68 from 1968-2003.
	Creates: nlsyw68_newxsec.dta

	Note: In order to observe family income around 40 for most respondents,
	      the 1988 (cleaned) cross section is selected for main analysis. As 
	      the median age in the 1968 cross section was 18, respondents would have 
	      reached a median age of 40 around 1989. The closest survey year to 
	      1989 is 1988.
----------------------------------------------------------------------------- */

cd "$Mydirectory1/1_DataSources/NLSYW68/"

* Run the value-label do-files for the supplementary extracts
* (original note: these import the supplementary data and save dta files)
foreach extract in hhsize altmotherocc_Rage14 momwork_Rage14 classofwrker identify_siblings {
	quietly run "./rawdata/`extract'-value-labels.do"
}

* Import the main extract and run the NLS-provided clean up
run  "./rawdata/NLSYW68-ISM-value-labels.do"
sort R0000100_1968

*------------------------------------------------------------------------ 
*	 Merge in other relevant variables
*------------------------------------------------------------------------ 
	/* Each file is matched 1:1 on the 1968 respondent ID; the merge indicator
	   is not kept (nogenerate). In order:
	     hhsize.dta               household size
	     altmotherocc_Rage14.dta  mother occupation (alternate version) & who R lived with at age 14
	     momwork_Rage14           whether mother worked when R was young
	     classofwrker             class of worker (i.e., self-employed, private, gov't, without pay)
	     sibs                     sibling ID code */
	local addons hhsize.dta altmotherocc_Rage14.dta momwork_Rage14 classofwrker sibs
	foreach f of local addons {
		merge 1:1 R0000100_1968 using "./rawdata/`f'", nogenerate
	}

*------------------------------------------------------------------------ 
*	 Flag respondents who were interviewed in 1988
*------------------------------------------------------------------------ 
	* R1229400_1988 is the variable later renamed to region_residence1988.
	* NOTE(review): -5 is presumably the NLS non-interview code -- confirm against the codebook.
	gen interviewed1988 = !(R1229400_1988 == -5)

*------------------------------------------------------------------------ 
*	 RENAME AND CLEAN VARIABLES
*------------------------------------------------------------------------ 

*----------------------
*	Interview information	
*----------------------
	* Interview month (available all years).
	* map() builds each new name as "doim" + the last four characters of the
	* old name, i.e. the survey year suffix.
		renvars R0002200_1968 R0085100_1969 R0145100_1970 R0251800_1971	R0333100_1972 R0415000_1973 R0517500_1975 ///
			R0546700_1977 R0587000_1978 R0709600_1980 R0756100_1982 R0802900_1983 R0947000_1985 R1062500_1987 ///
			R1108900_1988 R1232400_1991 R1364900_1993 DOI_MM_1995 DOI_MM_1997 DOI_MM_1999 DOI_MM_2001 DOI_MM_2003  ///
			, map("doim" + substr("@", -4,4)) 
		qui:mvdecode doim*, mv(-5 -4 -3 -2 -1)

	* Interview year (only provided in some waves)
		renvars R0002400_1968 R0085300_1969 R0145250_1970 R0252000_1971 R0333600_1972 R0756000_1982 R0802800_1983 ///
			R0946900_1985 R1108800_1988 R1232300_1991 DOI_YYYY_1995 DOI_YYYY_1997 DOI_YYYY_1999 DOI_YYYY_2001 ///
			DOI_YYYY_2003 ///			
			, map("doiy" + substr("@", -4,4)) 

		/* Note: Some survey waves do not provide a variable for interview year. 
		         Will create these variables for respondents who report a non-missing
		         survey month. The year is taken from the loop index so that the value
		         always matches the variable suffix (the previous hand-written list
		         assigned 1978 to doiy1980). */
			foreach wave in 1973 1975 1977 1978 1980 1987 1993 {
				gen doiy`wave' = `wave' if !mi(doim`wave')
			}
		qui:mvdecode doiy*, mv(-5 -4 -3 -2 -1)

	* Cross-sectional sampling weights
		//1968-1969
		rename R0000200_1968 weight1968
		rename R0085410_1969 weight1969

		//1970-2003: new name is "weight" + four characters of the old name starting at position 4
		renvars SWT* , map("weight" + substr("@", 4,4))  
		qui:mvdecode weight*, mv(-5 -4 -3 -2 -1)
		sum  weight*  

*-------------------
*	Id variables
*-------------------
	* Household identifier
	rename R0000300_1968 hhid

	* Respondent identification code, followed by a uniqueness report
	* (-unique- is a user-written command; presumably it counts distinct values)
	rename R0000100_1968 id
	unique id

*----------------
* Unions 
*----------------
	/* Notes (from the original author):
	     (1) The union variable changes after 1995, so union status is only coded
	         for the earlier waves; it is only needed in the 1988 cross-section.
	     (2) The union variable used here is available from 1970-1995.
	     (3) Survey question: "Are wages on current job set by collective bargaining?"
	*/

		* Raw union items, one per wave; renamed to union<year>
		local union_raw R0159500_1970 R0266500_1971 R0348800_1972 R0430000_1973 R0554900_1977 ///
				R0719200_1980 R0762300_1982 R0810800_1983 R1069500_1987 R1117400_1988 ///
				R1240700_1991 R1374700_1993
		renvars `union_raw', map("union" + substr("@", -4,4))

		* Negative codes become Stata missing
		quietly mvdecode union*, mv(-5 -4 -3 -2 -1)

*----------------
* Demographics
*----------------

*Respondents
	* Sex: constants for this file (every record gets sex=2, female=1)
		gen sex    = 2
		gen female = 1

	* Year of respondent birth (1968 version): two-digit year + 1900, positive raw values only
		gen doby = 1900 + R0042012_1968 if R0042012_1968 > 0

	* Month of respondent birth (1968 version)
		rename R0042010_1968 dobm

	* Negative codes in birth year/month become Stata missing
		quietly mvdecode doby dobm, mv(-5 -4 -3 -2 -1)
			codebook doby dobm

	* Race (1968 version)
		/* Note: See https://nlsinfo.org/content/cohorts/mature-and-young-women/topical-guide/household/race-ethnicity-nationality
				for more info on race and ethnicity in NLSYW68.
		*/
		gen race = R0003200_1968
		quietly mvdecode race, mv(-5 -4 -3 -2 -1)
		fre race

	* Indicator for race code 2 (0 when race is missing)
		gen black = (race == 2)

	* Marital status
		/* Note: Two versions of marital status exist:
		         (1) revised (by NLS), and
		         (2) current status without corrections.

		         Version (1) is preferred; version (2) fills in wherever
		         (1) is unavailable or missing. */

		//Version 1: revised (preferred)
		local mar_revised R0042051_1968 R0116651_1969 R0197050_1970 R0311951_1971 R0396251_1972 R0480551_1973 ///
				HRC_3F2_1995 HRC_3F2_1997 HRC_3F2_1999 HRC_3F2_2001 HRC_3F2_2003
		renvars `mar_revised', map("revimar" + substr("@", -4,4))

		//Version 2: current status
		local mar_current R0003300_1968 R0086200_1969 R0146100_1970 R0334400_1972 R0519600_1975 R0581000_1977 ///
				R0587500_1978 R0710000_1980 R0756500_1982 R0940700_1983 R0947400_1985 R1063000_1987 ///
				R1221000_1988 R1352700_1991 R1569900_1993 MARST_1995 MARST_1997 MARST_1999 MARST_2001 MARST_2003
		renvars `mar_current', map("initmar" + substr("@", -4,4))

		quietly mvdecode initmar* revimar*, mv(-5 -4 -3 -2 -1)

		//Harmonized version: one marital_status<year> per wave that has either source
			forvalues y = 1968/2003 {
				* Record which of the two sources exist for this year
				capture confirm variable revimar`y'
				local has_revised = (_rc == 0)
				capture confirm variable initmar`y'
				local has_current = (_rc == 0)

				if `has_revised' {
					gen marital_status`y' = revimar`y'
					if `has_current' {
						* Fill gaps in the revised version from the current-status version
						replace marital_status`y' = initmar`y' if mi(marital_status`y') & !mi(initmar`y')
					}
				}
				else if `has_current' {
					gen marital_status`y' = initmar`y'
				}
			}

	* Region of residence (1968 version), copied before the raw variable is renamed below
		clonevar grewup_south = R0003306_1968

	* Region of residence (in current survey year), renamed to region_residence<year>
		/* Note: the variables listed here cover 1968-2003 */
		local region_raw R0003306_1968 R0086206_1969 R0146106_1970 R0253306_1971 R0333606_1972 R0415506_1973 R0517906_1975 ///
				R0546916_1977 R0586106_1978 R0753400_1980 R0800600_1982 R0943300_1983 R1056800_1985 R1105700_1987 ///
				R1229400_1988 R1361500_1991 R1579000_1993 REGION_1995 REGION_1997 REGION_1999 REGION_2001 REGION_2003
		renvars `region_raw', map("region_residence" + substr("@", -4,.))

		* Negative codes and the value 9 are treated as missing
		quietly mvdecode region_residence*, mv(-5 -4 -3 -2 -1 9)
		sum region_residence*

	* Foreign-born: 1 when the 1968 item equals 5, otherwise 0 (including when it is missing)
		gen foreignborn = (R0037600_1968 == 5)

*Parents
	* Who R was living with at age 14
		clonevar Rwho_livew_age14 =R0039100_1968
		qui:mvdecode Rwho_livew_age14, mv(-5 -4 -3 -2 -1)
	
	* Foreign born 
		/* The raw items are never mvdecoded, so a bare "!=1" test would classify
		   negative nonresponse codes as foreign-born. Negative codes are now
		   treated like Stata missing values (indicator = 0), matching the existing
		   !mi() guard.
		   NOTE(review): assumes code 1 = born in the US and that all negative
		   values are nonresponse codes, as elsewhere in this file -- confirm
		   against the codebook. */
		gen fatherforeign = R0038400_1968!=1 & R0038400_1968>=0 & !mi(R0038400_1968)
		gen motherforeign = R0038500_1968!=1 & R0038500_1968>=0 & !mi(R0038500_1968)

*-------------------------------
* Education
*------------------------------- 
* Respondent
	/* Note: There are two versions of highest grade completed: 
	         (1) revised (by NLS) -> revihgc<year>, and 
	         (2) current status without corrections -> inithgc<year>

	         We will use (1) first, and 
	         when it is unavailable, we will use (2). */

		renvars R0072000_1968 R0134600_1969 R0212600_1970 R0326200_1971 R0521100_1975 R0590600_1978 R1215100_1988 R1346400_1991 ///
				EAT_5_1995 EAT_5_1997 EAT_5_1999 EAT_5_2001 EAT_5_2003  ///
				, map("inithgc" + substr("@", -4,4)) 

		renvars R0130900_1969 R0211400_1970 R0325540_1971 R0409900_1972 R0494300_1973 R0543010_1975 R0574400_1977 R0666390_1978 ///
				R0749910_1980 R0797110_1982 R0929510_1983 R1051610_1985 R1097410_1987 R1215110_1988 R1346410_1991 R1520410_1993 ///
				HIGHEST_GRADE_COMP_1995 HIGHEST_GRADE_COMP_1997 HIGHEST_GRADE_COMP_1999 HIGHEST_GRADE_COMP_2001 HIGHEST_GRADE_COMP_2003 ///
				, map("revihgc" + substr("@", -4,4)) 

		* Negative codes and 95-98 are treated as missing
		qui:mvdecode inithgc* revihgc* , mv(-5 -4 -3 -2 -1 95 98 97 96)

	//Correcting some mistakes in version (2) from 1995-2003
		* Value 11 is set to missing and the remaining values are shifted up by 8.
		* NOTE(review): the reason for the +8 offset is not documented here -- confirm against the EAT_5 codebook.
		forvalues y = 1995/2003{
			capture confirm variable inithgc`y'
			if !_rc {	
				replace inithgc`y'=. if inithgc`y'==11 
				replace inithgc`y'=inithgc`y'+8 				
			}
		}	
		sum inithgc* revihgc*

	//Harmonized version of HGC
		/* Same rule as for marital status: take the revised version, fill its
		   gaps from the unrevised version, and fall back to the unrevised
		   version alone in waves where no revised variable exists (1968).
		   Without the fallback, hgc1968 was never created. */
			forvalues y = 1968/2003{
				capture confirm variable revihgc`y'
				if !_rc {
					gen hgc`y'=revihgc`y'
					capture confirm variable inithgc`y'
					if !_rc {
						replace hgc`y'=inithgc`y' if mi(hgc`y') & !mi(inithgc`y')
					}
				}
				else {
					capture confirm variable inithgc`y'
					if !_rc {
						gen hgc`y'=inithgc`y'
					}
				}
			}

*Parents
	* Highest grade completed by R's father and mother (both available in 1968).
	* clonevar keeps the raw variables' labels and formats on the copies.
		clonevar hgc_dad = R0065600_1968
		clonevar hgc_mom = R0067600_1968

	* Negative codes become Stata missing; then tabulate both
		quietly mvdecode hgc_dad hgc_mom, mv(-5 -4 -3 -2 -1)
		fre hgc_dad hgc_mom

*-------------------------------
* Siblings
*------------------------------- 
	/* Rename the sibling ID-code items to idcode_sister<k>_yw / idcode_brother<k>_yw
	   and label them. The k-th variable in each list is the k-th sister/brother.
	   Labels are attached using the full new variable name (the earlier code
	   labelled the abbreviation without the _yw suffix, which only works when
	   variable abbreviation is on and the abbreviation is unambiguous). */
	local ordinals 1st 2nd 3rd 4th 5th 6th

	* Sisters
	local num 0
	foreach var of varlist R0002101_1968 R0002120_1968 R0002130_1968 R0002140_1968 {
		local ++num
		local lab : word `num' of `ordinals'

		ren `var' idcode_sister`num'_yw
		label var idcode_sister`num'_yw "ID code of `lab' sister in NLSYW (taken from YW survey)"
	}

	* Brothers
	local num 0
	foreach var of varlist R0002150_1968 R0002151_1968 R0002152_1968 R0002153_1968 R0002154_1968 R0002155_1968 	{
		local ++num
		local lab : word `num' of `ordinals'

		ren `var' idcode_brother`num'_yw
		label var idcode_brother`num'_yw "ID code of `lab' brother in NLSYM (taken from YW survey)"
	}

	*Dummy: R has a sibling in nlsym or nlsyw surveys
	* lookfor returns every variable whose name or label contains "idcode"
	lookfor idcode
    di "`r(varlist)'"
    local vars "`r(varlist)'"

    gen sib_nlsyw68 =.

    * Flag R as soon as any ID-code variable holds something other than -4.
    * NOTE(review): a Stata-missing ID code also satisfies "!= -4" and so counts
    * as a sibling -- confirm the merged ID-code variables never contain missing values.
    foreach v of local vars {
        di "`v'"
        replace sib_nlsyw68 =1 if `v'!=-4 & sib_nlsyw68==.
    }

    replace sib_nlsyw68 =0 if sib_nlsyw68==.
    tab sib_nlsyw68, m

*-------------------------------
* HH size
*------------------------------- 
	* Household size items, renamed to R_hhsize_plusR<year>
	local hhsize_raw R0070600_1968 R0134351_1969 R0212571_1970 R0326151_1971 R0414951_1972 R0502300_1973 ///
			R0649000_1978 R0752700_1980 R0766700_1982 R0878300_1983 R0960500_1985 R1081000_1987 ///
			R1184000_1988 R1313200_1991 R1421800_1993
	renvars `hhsize_raw', map("R_hhsize_plusR"+ substr("@",-4,4))

	* Inspect the raw distributions wave by wave before decoding
	foreach yr of numlist 1968/1973 1978(2)1982 1983 1985 1987 1988 1991 1993 {
		tab R_hhsize_plusR`yr', m
	}

	* Note: only -5, -4, -2 and -1 are decoded here (-3 is left as is)
	mvdecode R_hhsize_plusR*, mv(-5 -4 -2 -1)

	//TOPCODED VALUES
	/* Note: In 1968-1972 household size should be topcoded at 17, but a couple
	         of "18" values appear. In other surveys, values exceeding the topcode
	         were recoded to the topcoded value, so the same is done here. */
	forvalues yr = 1968/1972 {
		replace R_hhsize_plusR`yr' = 17 if R_hhsize_plusR`yr' == 18
	}

*-------------------------------
* Employment 
*-------------------------------
*Respondent
	* Class of worker in 1988; only the codes -5 and -4 are set to missing
	rename R1230700_1988 class_wrker_1988
	mvdecode class_wrker_1988, mv(-5 -4)

*Father--not available

*-------------------------------
* Occupation 
*-------------------------------
*Respondent
	/* Note: Two variables record "current/last job (3-digit Census code)".
	         Both are combined to recover the most respondents. */

	    //Version 1: uncollapsed (1968-1993) -> initoccu<year>
			local occu_v1 R0018700_1968 R0098800_1969 R0158200_1970 R0265200_1971 R0347500_1972 R0428600_1973 R0524100_1975 ///
					R0554000_1977 R0598600_1978 R0712700_1980 R0759300_1982 R0941500_1983 R1059400_1985 R1104400_1987 ///
					R1228700_1988 R1360100_1991 R1577800_1993
			renvars `occu_v1', map("initoccu" + substr("@", -4,4))

	    //Version 2: collapsed (1969-1993) -> finoccu<year>
			local occu_v2 R0135800_1969 R0220100_1970 R0327000_1971 R0410700_1972 R0495200_1973 R0544300_1975 R0585400_1977 ///
					R0702600_1978 R0754200_1980 R0801300_1982 R0944000_1983 R1060400_1985 R1106400_1987 R1230800_1988 ///
					R1362400_1991 R1579900_1993
			renvars `occu_v2', map("finoccu" + substr("@", -4,4))

			quietly mvdecode initoccu* finoccu*, mv(-5 -4 -3 -2 -1)

		* Harmonized occupation variable

			//1968-1993: start from the collapsed version, fill gaps from the uncollapsed one.
			// An all-missing finoccu1968 is created first so that 1968 is handled by the same loop.
			gen finoccu1968 = .
			forvalues y = 1968/1993 {
				capture confirm variable finoccu`y'
				if _rc continue

				gen occu`y' = finoccu`y'
				capture confirm variable initoccu`y'
				if !_rc {
					replace occu`y' = initoccu`y' if mi(occu`y') & !mi(initoccu`y')
				}
			}

			//1995-2003: new name is "occujob_" + the last seven characters of the old name
			renvars OCC00_*, map("occujob_" + substr("@", -7,.))
			quietly mvdecode initoccu* finoccu* occujob_*, mv(-5 -4 -3 -2 -1)
			foreach y in 1995 1997 1999 2001 2003 {
				* rowfirst() takes the first non-missing value across that year's job variables
				egen occu`y' = rowfirst(occujob_*_`y')
			}


*Parents 
	* Copies of the raw parental work/occupation items.
	* Only the occupation variables are decoded, each with its own code list.

	* Father: whether he worked, and his occupation, when R was age 14 (available in 1968)
		clonevar workdad_Rage14 = R0039300_1968
		clonevar occu_dad_r14   = R0039200_1968
		quietly mvdecode occu_dad_r14, mv(-2 -1)

	* Mother: occupation when R was age 14 (1968 version)
		clonevar occu_mom_r14_v1 = R0039500_1968
		quietly mvdecode occu_mom_r14_v1, mv(-5 -4 -3 -2 -1)

	* Mother: whether she worked when R was a teenager, and her occupation
	* when R was age 14 (1978 versions)
		clonevar workmom_Rage14  = R0697000_1978
		clonevar occu_mom_r14_v2 = R0697100_1978
		quietly mvdecode occu_mom_r14_v2, mv(-2 -1)
			
*----------------
* Family Income
*----------------

*Respondent
	* Family income items, renamed to famincv1_<year>
	local faminc_raw R0037000_1968 R0113300_1969 R0193100_1970 R0308700_1971 R0392500_1972 ///
			R0475100_1973 R0533300_1975 R0583400_1977 R0796500_1982 R0902000_1983 ///
			R1050000_1985 R1095800_1987 R1196200_1988 R1328300_1991 R1506300_1993 ///
			INC_21_1995 INC_21_1997 INC_21_1999 INC_21_2001 INC_21_2003
	renvars `faminc_raw', map("famincv1_" + substr("@", -4,.))

	* Any negative value is treated as missing, in every wave that exists
	forvalues y = 1968/2003 {
		capture confirm variable famincv1_`y'
		if _rc continue
		replace famincv1_`y' = . if famincv1_`y' < 0
	}

	* 1988 only: convert the income category to a dollar amount
	foreach y in 1988 {
		capture confirm variable famincv1_`y'
		if !_rc {
			gen fam_inc`y' = .

			* Bottom category (codes 0 and 1): 0.75 x 4000
			replace fam_inc`y' = 0.75*4000 if famincv1_`y'==0 | famincv1_`y'==1

			* Categories 2-10: one dollar amount per category, in order
			local code 2
			foreach amount in 5000 6750 8750 12500 16250 18750 22500 30000 42500 {
				replace fam_inc`y' = `amount' if famincv1_`y' == `code'
				local ++code
			}

			* Top category (code 11): 1.25 x 50000
			replace fam_inc`y' = 1.25*50000 if famincv1_`y'==11

			* No income value for anyone not interviewed in 1988
			replace fam_inc`y' = . if interviewed1988 != 1

			* Flags for the bottom and top categories (1 or missing at this stage)
			gen bottomcoded_son_`y' = 1 if fam_inc`y' == 0.75*4000
			gen topcoded_son_`y'    = 1 if fam_inc`y' == 1.25*50000
		}
	}

* keep only individuals with year of birth information
	drop if mi(doby) 

*------------------------------------------------------------------------ 
*					RESHAPE TO LONG
*------------------------------------------------------------------------
* Variable groups to carry into the long file. The macros are assigned as plain
* text (no "=") so the lists are copied verbatim instead of being evaluated as
* string expressions, and every variable is spelled out in full so the script
* does not depend on variable-name abbreviation.
local intvars "doim*  weight* id union* interviewed1988"
local demovars "sex female doby dobm race black marital_status* region_residence* grewup_south foreignborn fatherforeign motherforeign"
local edu "hgc_dad hgc_mom hgc*"
local occuvars "class_wrker_1988 occu* occu_dad_r14 occu_mom_r14_v1 occu_mom_r14_v2"
local faminvars "Rwho_livew_age14 workdad_Rage14  workmom_Rage14 famincv1_* fam_inc* R_hhsize_plusR* bottomcoded_son_* topcoded_son_* sib_nlsyw68 idcode_*"
keep  id hhid `intvars' `demovars' `edu' `occuvars' `faminvars'

* One row per respondent (id) and survey year; the year suffix of each stub becomes doiy
qui: reshape long doim weight region_residence marital_status hgc occu fam_inc famincv1_  ///
					union  bottomcoded_son_ topcoded_son_ R_hhsize_plusR, i(id) j(doiy)
					
	* The top/bottom-code flags were created as 1-or-missing; turn missing into 0
	replace topcoded_son_=0 if topcoded_son_==.
	replace bottomcoded_son_=0 if bottomcoded_son_==.

	* The job-level occupation variables are no longer needed once occu has been built
	drop occujob*

	gen year=doiy
	label var year "Survey year" 

*------------------------------------------------------------------------ 
*						GENERATE NEW VARS
*------------------------------------------------------------------------
describe

*----------------------
* Interview information	
*----------------------
	summarize doim doiy weight

*-----------
* Id vars 
*-----------
	* Distinct respondents, then distinct respondent-year pairs
	unique id
	unique id year
	summarize id

*----------------
* Unions 
*----------------
	rename union unionR
	label variable unionR "Wages set by union"
	label define yesno 0 "No" 1 "Yes"
	label values unionR yesno

*----------------
* Veterans 
*----------------
	* Placeholder: always missing in this file
	gen veteran = .

*----------------
* Demographics
*----------------

*Respondent
	*Sex
		label variable sex "R's sex"

	*Age
		/*Note: A missing birth month is set to 7 and a missing interview
		        month to 11 (described by the original author as the medians). */
		summarize dobm doim
		replace dobm = 7  if mi(dobm)
		replace doim = 11 if mi(doim)

		* Completed years between birth month and interview month
		gen age = floor((ym(doiy, doim) - ym(doby, dobm)) / 12)

		gen agesq = age * age

	* Birth cohorts
		clonevar dob = doby
		tab dob, m
		label variable dob "Year of birth"

	* Birth decade, plus one dummy per decade (decade_1, decade_2, ...)
		gen decade = 10 * floor(dob/10)
		tab decade, m
		label variable decade "Decade of birth"
		tab decade, gen(decade_)

	* Race: code 3 is set to missing; the value label still defines all three codes
		fre race
		replace race = . if race == 3
		label define race 1 "white" 2 "black" 3 "other"
		label values race race

	* Foreign born
		fre foreignborn
		label variable foreignborn "R was born outside the US"

	* Marital status dummies (missing whenever marital_status is missing)
		gen married = (marital_status==1 | marital_status==2) if !mi(marital_status)
		tab married, m

		* Remaining categories: dummy name paired with its marital_status code
		local mar_names never_married divorced widowed separated
		local mar_codes 6 4 3 5
		forvalues i = 1/4 {
			local dummy : word `i' of `mar_names'
			local code  : word `i' of `mar_codes'
			gen `dummy' = (marital_status == `code') if !mi(marital_status)
			tab `dummy', m
		}

*Parents
	* Foreign born
		fre fatherforeign
		label variable fatherforeign "R's father was born outside the US"

		fre motherforeign
		label variable motherforeign "R's mother was born outside the US"

*-------------------------------
* Education
*------------------------------- 

*Respondents
	* Categorical education variable built from highest grade completed (hgc)
		fre hgc
		gen eduR = .
		replace eduR = 0 if hgc == 0
		replace eduR = 1 if hgc < 8  & hgc >= 1    // some grade school
		replace eduR = 2 if hgc == 8               // completed 8th grade
		replace eduR = 3 if hgc < 12 & hgc > 8     // some HS
		replace eduR = 4 if hgc == 12              // 4 years of HS
		replace eduR = 5 if hgc < 16 & hgc > 12    // 1-3 years of college
		replace eduR = 6 if hgc > 15 & !mi(hgc)    // 4 or more years of college (like BA)
		tab eduR, m

	* Years of school
		clonevar yrsschool = hgc
		label variable yrsschool "Years of school"
		tab yrsschool, m nol

	* Years of school (binned); values of 20 or more are left missing
		gen yrsschool_bin = .
		replace yrsschool_bin = 0  if yrsschool == 0
		replace yrsschool_bin = 6  if yrsschool < 8  & yrsschool > 0   /*check this again!*/
		replace yrsschool_bin = 8  if yrsschool == 8
		replace yrsschool_bin = 10 if yrsschool < 12 & yrsschool > 8
		replace yrsschool_bin = 12 if yrsschool == 12
		replace yrsschool_bin = 14 if yrsschool < 16 & yrsschool > 12
		replace yrsschool_bin = 16 if yrsschool < 20 & yrsschool >= 16
		tab yrsschool_bin, m
		label variable yrsschool_bin "Years of school, binned"

	* High school degree: category 4 or above
		gen hs_ed = (eduR >= 4) if !mi(eduR)
		tab hs_ed, m
		label variable hs_ed "HS educated"

	* College education degree: category 6
		gen coll_ed = (eduR >= 6) if !mi(eduR)
		tab coll_ed, m
		label variable coll_ed "Coll educated"

*Parents
	fre hgc_mom hgc_dad
	clonevar dad_ed_raw = hgc_dad
	clonevar mom_ed_raw = hgc_mom

	* Educational categories (0-6) from the raw highest grade completed
		/* NOTE(review): raw values other than 0-17 and 25 (e.g. 18) are left
		   missing here, whereas the respondent's coding puts everything above 15
		   in the top category -- confirm this is intended. The meaning of raw
		   code 25 (mapped to category 4) is not documented in this file. */
		foreach name in mom dad {
			local raw `name'_ed_raw
			gen edu_`name' = .
			replace edu_`name' = 0 if `raw' == 0                //none
			replace edu_`name' = 1 if `raw' < 8  & `raw' > 0    //some grade school
			replace edu_`name' = 2 if `raw' == 8                //completed 8th grade
			replace edu_`name' = 3 if `raw' < 12 & `raw' > 8    //some HS
			replace edu_`name' = 4 if `raw' == 12               //4 years of HS
			replace edu_`name' = 5 if `raw' < 16 & `raw' > 12   //some college
			replace edu_`name' = 6 if inlist(`raw', 16, 17)     //college
			replace edu_`name' = 4 if `raw' == 25
			label variable edu_`name' "Educational categories for `name'"
		}

	* Binned years of school plus HS / college indicators
		foreach name in mom dad {
			* Category k (0-6) maps to the k-th entry of: 0 6 8 10 12 14 16
			gen edu_`name'_bin = .
			local cat 0
			foreach yrs in 0 6 8 10 12 14 16 {
				replace edu_`name'_bin = `yrs' if edu_`name' == `cat'
				local ++cat
			}
			tab edu_`name'_bin, m
			label variable edu_`name'_bin "`name' Years of school from bins"

		* High school degree
			gen `name'_hs_ed = (edu_`name' >= 4) if !mi(edu_`name')
			tab `name'_hs_ed, m

		* College education degree
			gen `name'_coll_ed = (edu_`name' >= 6) if !mi(edu_`name')
			tab `name'_coll_ed, m

			label variable `name'_hs_ed "`name' HS educated"
			label variable `name'_coll_ed "`name' College educated"
		}

	* Years of school: the raw copies become yrsschool_dad / yrsschool_mom
		foreach parent in dad mom {
			rename `parent'_ed_raw yrsschool_`parent'
			tab yrsschool_`parent', m
		}

*-------------------------------
* HH size
*------------------------------- 
	* Household size excluding the respondent
	gen R_hhsize_minusR = R_hhsize_plusR - 1
	tab R_hhsize_minusR, m

	label variable R_hhsize_plusR "total # of persons in R's hh (including R)"
	label variable R_hhsize_minusR "total # of persons in R's hh (NOT including R)"

*-------------------------------
* Employment 
*-------------------------------
*Respondents
	* Self-employment indicator from the 1988 class-of-worker item (code 4);
	* left missing only where that item is system-missing (.)
	gen selfemployed = (class_wrker_1988 == 4) if class_wrker_1988 != .
	tab selfemployed, m

*-------------------------------
* Occupation 
*-------------------------------

	/*Notes: (1) The respondent occupation variable uses 1960 Census occupation
	             codes from 1968-1993 and 2000 Census occupation codes from
	             1995-2003.
	         (2) The parental occupation variables use 1960 Census occupation
	             codes. */

*-----------------------------------
* Crosswalk occupations (parents) 
*-----------------------------------

	* Name (and label, where one is set) given to the crosswalked code for each parental variable
	local target_dad_r14     father_occ1950ej
	local target_mom_r14_v1  motheroccej1
	local target_mom_r14_v2  motheroccej2
	local label_mom_r14_v1   "motheroccej--v1 (1968)"
	local label_mom_r14_v2   "motheroccej--v2 (1978)"

	foreach name in dad_r14 mom_r14_v1 mom_r14_v2  {

		* The crosswalk is keyed on census1960, so expose the parental code under that name
		clonevar census1960 = occu_`name'
		merge m:1 census1960 using ../Crosswalks/Crosswalk_1960Census_toANES

			* Unmatched master records may only be missing or -4 / -5
			tab census1960 if _merge==1, m
			assert (census1960==-5 | census1960==-4 | census1960==.) if _merge==1

			* Drop crosswalk-only rows, then the merge indicator and the temporary key
			drop if _merge==2
			drop _merge census1960

		* The crosswalk delivers its result as fatheroccej; park it under the
		* parent-specific name so that the next pass can merge again
		rename fatheroccej `target_`name''
		if "`label_`name''" != "" {
			label var `target_`name'' "`label_`name''"
		}

	}
	
********************************************************************************************************
	* For fathers/mothers: Fix occupations for self-employed businessmen, managers, or officials before merging income scores * 
*******************************************************************************************************
/* Note: No class-of-worker variable was found for R's mother or father
         when R was growing up, so no such fix is applied to the parents. */

*----------------------------------------
* Crosswalk occupations (respondents) 
*----------------------------------------

	* Split the respondent's occupation by coding scheme
	gen occu1960 = occu if inrange(year, 1968, 1993)
	gen occu2000 = occu if inrange(year, 1995, 2003)

	//1960 codes
	clonevar census1960 = occu1960
	merge m:1 census1960 using  ../Crosswalks/Crosswalk_1960Census_toANES.dta
		* Only missing codes may fail to match
		assert census1960==. if _merge==1
		keep if _merge != 2
		drop _merge
		rename fatheroccej occej_from1960

	//2000 codes
	clonevar occ2000 = occu2000
	merge m:1 occ2000 using ../Crosswalks/Crosswalk_2000Census_toANES.dta
		assert occ2000==. if _merge==1
		keep if _merge != 2
		drop _merge
		rename fatheroccej_2000 occej_from2000

	//Harmonized, coarsened occupation of the RESPONDENT.
	// It is held in a variable called fatheroccej until the renames at the end of this section.
	capture drop fatheroccej
	gen fatheroccej = cond(inrange(year,1968,1993), occej_from1960, ///
	                  cond(inrange(year,1995,2003), occej_from2000, .))

********************************************************************************************************
* For respondents: Fix occupations for self-employed businessmen, managers, or officials before merging income scores * 
*******************************************************************************************************
	* Self-employed respondents coded 28 are moved to code 21
	replace fatheroccej = 21 if selfemployed==1 & fatheroccej==28

	//Final names: the respondent's code becomes occRej, and the father's code
	//(parked as father_occ1950ej) takes the name fatheroccej
	rename fatheroccej occRej
	rename father_occ1950ej fatheroccej

*------------------------------------------
* Head of hh dummies + father_imputed dummy	
*------------------------------------------
	* Each dummy is derived from who R lived with at age 14 and is missing when that item is missing
	gen headofhh_father = (inrange(Rwho_livew_age14,1,4)) if Rwho_livew_age14<.
	tab headofhh_father, m
	
	gen headofhh_mother = (Rwho_livew_age14==5) if Rwho_livew_age14<.
	tab headofhh_mother, m
	
	gen headofhh_othermale = (Rwho_livew_age14==6) if Rwho_livew_age14<.
	tab headofhh_othermale,m 
	
	gen headofhh_otherfemale = (Rwho_livew_age14==7) if Rwho_livew_age14<.
	tab headofhh_otherfemale,m 
	
	label var headofhh_father "Head of hh when R was growing up was R's father"
	label var headofhh_othermale "Head of hh when R was growing up was some other male (not R's father)"
	label var headofhh_otherfemale "Head of hh when R was growing up was some other female (not R's mother)"
	label var headofhh_mother "Head of hh when R was growing up was R's mother"

	
	//Create alternate dummy for hh head being R's father. 
	     /* Note: When R reports occupation of their father but 
	              does not specify who they lived with when
	              growing up, will assume that R lived with father.

	              Specific to NLSYW68: Father occupation is always 
	                                   missing when there's no info 
	                                   about who R lived with at age 14.
	                                   Therefore, no difference between
	                                   headofhh_father_imputed and 
	                                   headofhh_father.
	     */
	tab fatheroccej if Rwho_livew_age14==., m 
	gen headofhh_father_imputed = headofhh_father
	* The label now refers to age 14, the age the household item in this survey refers to (it previously said age 16)
	label var headofhh_father_imputed "Impute dad when parent occ != missing & no info about hh head at age 14"	

*-----------------------------------------------------------------------------------------------------
*  DUMMIES FOR WHEN WE KNOW WHY DAD OR MOM DIDN'T WORK (I.E. WHY FATHEROCCEJ/MOTHEROCCEJ IS MISSING)
*-----------------------------------------------------------------------------------------------------

	/*Note: The parental occupation questions follow a skip logic. Anyone who
	        answered "no" to the preceding question---"did your mom/mother figure
	        or father/father figure work for pay when you were 14 years old?"---
	        has a missing value for occupation. */

	* Father: 0 when an occupation is recorded; 1 when it is missing, the work
	* item equals 0 and R lived in a household of type 1-4 or 6; missing otherwise
	gen father_notworking = 0 if fatheroccej != .
	replace father_notworking = 1 if fatheroccej==. & workdad_Rage14==0 & inlist(Rwho_livew_age14,1,2,3,4,6)
	tab father_notworking, m

	* Mother (1978 occupation version): same construction, with work item equal
	* to 4 and household types 1-3, 5 or 7
	gen mother_notworking = 0 if motheroccej2 != .
	replace mother_notworking = 1 if motheroccej2==. & workmom_Rage14==4 & inlist(Rwho_livew_age14,1,2,3,5,7)
	tab mother_notworking, m

* Variable for father being either farm laborer or operator (codes 71 and 81);
* missing when the father's occupation is missing
	gen fatherfarm = inlist(fatheroccej, 71, 81) if fatheroccej != .

*------------------------------
* Family Income
*------------------------------
* Respondents
	label variable fam_inc "R's Family Income, binned (based on midpoints of each bin)"


*------------------------------------------------------------------------ 
* Save a dataset here for Appendix E exercise (drop table)
*------------------------------------------------------------------------ 

	* Snapshot of the long file before any sample restriction is applied
	save "./wrkdata/nlsyw68_4droptable.dta", replace

*------------------------------------------------------------------------ 
* Jácome et al. vs Mazumder et al benchmarking exercise
*------------------------------------------------------------------------ 

	* Work on a temporary copy; the full long file is restored afterwards
	preserve
		* The 1978 version of mother's occupation is the one exported
		ren motheroccej2 motheroccej
		ren id id_daughter

		keep id_daughter fatheroccej motheroccej father_notworking grewup_south dob
		sort id_daughter

		//Preliminary step: check that each exported variable is constant within id
		* Suffix used for each variable's temporary tag variables
		local tagnum_fatheroccej        1
		local tagnum_motheroccej        2
		local tagnum_grewup_south       3
		local tagnum_dob                4
		local tagnum_father_notworking  5

		foreach var of varlist fatheroccej motheroccej father_notworking grewup_south dob	{
			local num `tagnum_`var''

			* Tag one record per distinct (id, value) pair, then count the tags per id
			egen tag`num' = tag(id_daughter `var'), missing
			by id_daughter: egen tag`num'_total = total(tag`num')
			tab tag`num'_total, m //Original author's check: only 1 value of each variable per id

			ren `var' `var'_nlsyw68
		}
		drop tag*

		//Keep only one record for each unique id
		bysort id_daughter: keep if _n==1
		count
		save "./wrkdata/MD_benchmarkingexercise_nlsyw68.dta", replace
	restore

*------------------------------------------------------------------------ 
*  OUR SAMPLE SELECTION (to harmonize across datasets)
*------------------------------------------------------------------------ 

*Restrictions
	* Drop anyone flagged as foreign born (records with a missing flag are kept)
	drop if foreignborn != 0 & foreignborn != .

	* Flag women older than 40 in the 1995 and later waves
	gen oldwomen = (year >= 1995 & age > 40)

	rename weight weight_nlsyw68
	sort id year

save "./wrkdata/nlsyw68_foranalysis.dta", replace

*------------------------------------------------------------------------ 
*		Create a cross section
*------------------------------------------------------------------------ 

use "./wrkdata/nlsyw68_foranalysis.dta", clear
capture drop __*

*More restrictions
	* 1988 wave only, ages 30-50, interviewed in 1988
	sort id year
	keep if year==1988 & inrange(age,30,50) & interviewed1988==1

	* Each respondent should now appear once
	unique id
	duplicates list id  //original author's note: 0 duplicates

	rename region_residence south_residence

	//Preferred mother occupation variable
		/* Note: The 1978 version is kept because more respondents
		         answered it; the 1968 version is dropped. */
	rename motheroccej2 motheroccej
	drop motheroccej1

* Convert family income variable to 1950 dollars
	* The deflator is matched on the year before the survey year
	gen year_CPI = year - 1
	merge m:1 year_CPI using "../CPI/CPI_deflator.dta"
	drop if _merge==2
	drop _merge

	gen fam_inc_real = fam_inc * deflator
	label variable fam_inc_real "Family income (bins), in 1950 dollars"

	gen lnfaminc = ln(fam_inc_real)
	label variable lnfaminc "Logged family income (bins)"

	drop CPI year_CPI deflator

* Save 
	capture drop __*
	save "./wrkdata/nlsyw68_newxsec.dta", replace

